1. Data Collection¶

a. Loading the dataset¶

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly
import plotly.graph_objs as go
import seaborn as sns
import plotly.express as px

boxify_dataset = pd.read_csv(r"E:\DATA ANALYSIS\UpGrad\Capstone_Project\BOXIFY_UPGRAD_CAPSTONE PROJECT\Boxify Dataset.csv")
boxify_dataset.head()
Out[1]:
Order File_Type SKU_number SoldFlag SoldCount MarketingType ReleaseNumber New_Release_Flag StrengthFactor PriceReg ReleaseYear ItemCount LowUserPrice LowNetPrice
0 2 Historical 1737127 0.0 0.0 D 15 1 682743.0 44.99 2015 8 28.97 31.84
1 3 Historical 3255963 0.0 0.0 D 7 1 1016014.0 24.81 2005 39 0.00 15.54
2 4 Historical 612701 0.0 0.0 D 0 0 340464.0 46.00 2013 34 30.19 27.97
3 6 Historical 115883 1.0 1.0 D 4 1 334011.0 100.00 2006 20 133.93 83.15
4 7 Historical 863939 1.0 1.0 D 2 1 1287938.0 121.95 2010 28 4.00 23.99

b. Summary of DataFrames¶

In [50]:
boxify_dataset.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198917 entries, 0 to 198916
Data columns (total 14 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Order             198917 non-null  int64  
 1   File_Type         198917 non-null  object 
 2   SKU_number        198917 non-null  int64  
 3   SoldFlag          75996 non-null   float64
 4   SoldCount         75996 non-null   float64
 5   MarketingType     198917 non-null  object 
 6   ReleaseNumber     198917 non-null  int64  
 7   New_Release_Flag  198917 non-null  int64  
 8   StrengthFactor    198917 non-null  float64
 9   PriceReg          198917 non-null  float64
 10  ReleaseYear       198917 non-null  int64  
 11  ItemCount         198917 non-null  int64  
 12  LowUserPrice      198917 non-null  float64
 13  LowNetPrice       198917 non-null  float64
dtypes: float64(6), int64(6), object(2)
memory usage: 21.2+ MB

c. Statistical Summary of Dataset¶

In [51]:
boxify_dataset.describe()
Out[51]:
Order SKU_number SoldFlag SoldCount ReleaseNumber New_Release_Flag StrengthFactor PriceReg ReleaseYear ItemCount LowUserPrice LowNetPrice
count 198917.000000 1.989170e+05 75996.000000 75996.000000 198917.000000 198917.000000 1.989170e+05 198917.000000 198917.000000 198917.000000 198917.000000 198917.000000
mean 106483.543242 8.613626e+05 0.171009 0.322306 3.412202 0.642248 1.117115e+06 90.895243 2006.016414 41.426283 30.982487 46.832053
std 60136.716784 8.699794e+05 0.376519 1.168615 3.864243 0.479340 1.522090e+06 86.736367 9.158331 37.541215 69.066155 128.513236
min 2.000000 5.000100e+04 0.000000 0.000000 0.000000 0.000000 6.275000e+00 0.000000 0.000000 0.000000 0.000000 0.000000
25% 55665.000000 2.172520e+05 0.000000 0.000000 1.000000 0.000000 1.614188e+05 42.000000 2003.000000 21.000000 4.910000 17.950000
50% 108569.000000 6.122080e+05 0.000000 0.000000 2.000000 1.000000 5.822240e+05 69.950000 2007.000000 32.000000 16.080000 33.980000
75% 158298.000000 9.047510e+05 0.000000 0.000000 5.000000 1.000000 1.430083e+06 116.000000 2011.000000 50.000000 40.240000 55.490000
max 208027.000000 3.960788e+06 1.000000 73.000000 99.000000 1.000000 1.738445e+07 12671.480000 2018.000000 2542.000000 14140.210000 19138.790000

2. Data Cleaning and Preprocessing¶

a. Handling missing values¶

In [89]:
#Analysing the number of missing values in the dataset
missing_values_summary=boxify_dataset.isnull().sum()
missing_values_summary
Out[89]:
Order                    0
File_Type                0
SKU_number               0
SoldFlag            122921
SoldCount           122921
MarketingType            0
ReleaseNumber            0
New_Release_Flag         0
StrengthFactor           0
PriceReg                 0
ReleaseYear              1
ItemCount                0
LowUserPrice             0
LowNetPrice              0
dtype: int64
In [90]:
#Filling the missing values  of soldflag and soldCount with zero
boxify_dataset['SoldCount'].fillna(0, inplace=True)
boxify_dataset['SoldFlag'].fillna(0, inplace=True)

missing_values_summary=boxify_dataset.isnull().sum()
missing_values_summary
Out[90]:
Order               0
File_Type           0
SKU_number          0
SoldFlag            0
SoldCount           0
MarketingType       0
ReleaseNumber       0
New_Release_Flag    0
StrengthFactor      0
PriceReg            0
ReleaseYear         1
ItemCount           0
LowUserPrice        0
LowNetPrice         0
dtype: int64

b.Checking the Column Data types¶

In [5]:
boxify_dataset.dtypes
Out[5]:
Order                  int64
File_Type             object
SKU_number             int64
SoldFlag             float64
SoldCount            float64
MarketingType         object
ReleaseNumber          int64
New_Release_Flag       int64
StrengthFactor       float64
PriceReg             float64
ReleaseYear            int64
ItemCount              int64
LowUserPrice         float64
LowNetPrice          float64
InventoryTurnover    float64
dtype: object
In [5]:
# Changing the data types of ReleaseYear, MarketingType, SoldFlag, SoldCount and FileType

boxify_dataset['MarketingType'] = boxify_dataset['MarketingType'].astype('category')
boxify_dataset['New_Release_Flag'] = boxify_dataset['New_Release_Flag'].astype('category')
boxify_dataset['ReleaseYear'] = pd.to_datetime(boxify_dataset['ReleaseYear'], format='%Y', errors='coerce')
boxify_dataset['SKU_number'] = boxify_dataset['SKU_number'].astype('string')
boxify_dataset.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198917 entries, 0 to 198916
Data columns (total 14 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   Order             198917 non-null  int64         
 1   File_Type         198917 non-null  object        
 2   SKU_number        198917 non-null  string        
 3   SoldFlag          75996 non-null   float64       
 4   SoldCount         75996 non-null   float64       
 5   MarketingType     198917 non-null  category      
 6   ReleaseNumber     198917 non-null  int64         
 7   New_Release_Flag  198917 non-null  category      
 8   StrengthFactor    198917 non-null  float64       
 9   PriceReg          198917 non-null  float64       
 10  ReleaseYear       198916 non-null  datetime64[ns]
 11  ItemCount         198917 non-null  int64         
 12  LowUserPrice      198917 non-null  float64       
 13  LowNetPrice       198917 non-null  float64       
dtypes: category(2), datetime64[ns](1), float64(6), int64(3), object(1), string(1)
memory usage: 18.6+ MB

3. Exploratory Data Analysis¶

a. Frequency Distribution of File Types¶

In [36]:
fig = px.histogram(boxify_dataset.File_Type, x="File_Type",title="Distribution of File Types")
fig.update_layout(autosize=False,
    width=400,
    height=400)
fig.show()

b. Frequency Distribution of Marketing Type¶

In [39]:
custom_colors = ['#8481DD', '#F6D173']  # Add more colors if you have more categories
fig = px.pie(
    boxify_dataset,
    names='MarketingType',
    title='Frequency Distribution of Marketing Types',
    hole=0.4,  # Optional: makes it a donut chart for better readability
    color_discrete_sequence=custom_colors
)
fig.update_traces(textinfo='percent')

fig.update_layout(
    autosize=False,
    width=400,
    height=400
)

fig.show()
In [ ]:
 

c. Sales Trend Over Time¶

In [73]:
# Aggregate sales by ReleaseYear
yearly_sales = valid_years.groupby('ReleaseYear')['SoldCount'].sum().reset_index()

# Plot the time series
fig = px.line(yearly_sales, x='ReleaseYear', y='SoldCount', 
              title='Yearly Sales Trends',
              labels={'ReleaseYear': 'Year', 'SoldCount': 'Total Sold Count'})

fig.update_traces(mode='lines+markers', marker=dict(symbol='circle', size=5))
fig.update_layout(xaxis=dict(range=[1900, 2024]), autosize=False, width=600, height=600)
fig.show()

d. Sales By Marketing Types¶

In [40]:
market_compare = boxify_dataset.groupby(['File_Type','MarketingType'])[['Order']].count()
market_compare
C:\Users\hp\AppData\Local\Temp\ipykernel_16072\2634249690.py:1: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

Out[40]:
Order
File_Type MarketingType
Active D 62852
S 60069
Historical D 35119
S 40877
In [56]:
market_com = market_compare.reset_index()
color_map = {
    'D': '#7CC674',
    'S': '#F4B678'
}
fig = px.bar(
    market_com,
    x='File_Type',
    y='Order',
    color='MarketingType',
    title='Top Selling Products',
    labels={'File_Type': 'File Types', 'Order': 'Order Count'},
    barmode='group',
    color_discrete_map=color_map
)

fig.update_traces(marker_line_color='black', marker_line_width=1, width=0.35)

fig.update_layout(title_text='The Order Counts for Active and Historical Products', title_x=0.5, autosize=False, width=600, height=600)
fig.show()
C:\Users\hp\anaconda3\Lib\site-packages\plotly\express\_core.py:1958: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

e. Marketing Type counts for SKUs that have and have not been sold in the last 6 months¶

In [76]:
marketing_counts = boxify_dataset.groupby(['SoldFlag', 'MarketingType']).size().reset_index(name='count')
custom_colors = ['#A2D9D9', '#F4B678']
fig0 = px.pie(marketing_counts[marketing_counts['SoldFlag'] == 0], 
              names='MarketingType', values='count', 
              title='Marketing Types for SKUs Not Sold in the Last 6 Months', hole = 0.4,
             color_discrete_sequence=custom_colors)

# Filter data for SoldFlag = 1
fig1 = px.pie(marketing_counts[marketing_counts['SoldFlag'] == 1], 
              names='MarketingType', values='count', 
              title='Marketing Types for SKUs Sold in the Last 6 Months',hole = 0.4,
              color_discrete_sequence=custom_colors)
fig0.update_layout(
    autosize=False,
    width=550,
    height=400
)
fig1.update_layout(
    autosize=False,
    width=550,
    height=400
)
fig0.show()
fig1.show()
C:\Users\hp\AppData\Local\Temp\ipykernel_16072\2290916386.py:1: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

f. Top-selling Products¶

In [76]:
top_selling_products = boxify_dataset.groupby('SKU_number')['SoldCount'].sum().reset_index()

# Sorting SoldCount in descending order and considering top 10 products
top_selling_products = top_selling_products.sort_values(by='SoldCount', ascending=False).head(10)

# Plotting the top-selling products
fig = px.bar(top_selling_products, x='SKU_number', y='SoldCount', 
             title='Top Selling Products',
             labels={'SKU_number': 'SKU Number', 'SoldCount': 'Total Sold Count'})

fig.update_traces(marker_color='green', marker_line_color='black', marker_line_width=1.5)
fig.update_layout(title_text='Top 10 Selling Products', title_x=0.5, autosize=False, width=600, height=600)
fig.show()

g. Top Selling Categories¶

In [82]:
top_selling_products = boxify_dataset.groupby('MarketingType')['SoldCount'].sum().reset_index()

# Sorting SoldCount in descending order and considering top 10 products
top_selling_products = top_selling_products.sort_values(by='SoldCount', ascending=False).head(10)

# Plotting the top-selling products
fig = px.bar(top_selling_products, x='MarketingType', y='SoldCount', 
             title='Top Selling Products',
             labels={'MarketingType': 'Marketing Type', 'SoldCount': 'Total Sold Count'})

fig.update_traces(marker_color='orange', marker_line_color='black', marker_line_width=1.5)
fig.update_layout(title_text='Top 10 Selling Products Marketing Type', title_x=0.5, autosize=False, width=600, height=600)
fig.show()
C:\Users\hp\AppData\Local\Temp\ipykernel_2904\2991398427.py:1: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

h. Comparison of Prices for the Hsitorical and Active products¶

In [121]:
price_comparison = boxify_dataset.groupby('File_Type')[['PriceReg', 'LowUserPrice', 'LowNetPrice']].mean().reset_index()
fig = px.bar(price_comparison, x='File_Type', y=['PriceReg', 'LowUserPrice', 'LowNetPrice'], 
             title='Top Selling Products',
             labels={'File_Type': 'File Types'}, barmode='group')

fig.update_traces(marker_line_color='black', marker_line_width=1.5, width=0.25)
fig.update_layout(title_text='The Average Prices for Active and Historical Products', title_x=0.5, autosize=False, width=600, height=600)
fig.show()

i. Calculation of Stock Levels¶

In [128]:
current_stock_items = boxify_dataset.groupby('SKU_number')['ItemCount'].sum().reset_index()
current_stock_items

#To identify the stock_count below defined threshold value
low_stock_threshold = 10
low_stock_items = boxify_dataset[boxify_dataset['ItemCount'] < low_stock_threshold]
low_stocks_count = low_stock_items.shape[0]
low_stocks_count_percentage = (low_stocks_count/boxify_dataset.shape[0])*100
print(f"Low stock item count: {low_stocks_count}")
print(f"Low stock item count percentage: {low_stocks_count_percentage:.2f}%")
Low stock item count: 3379
Low stock item count percentage: 1.70%
In [149]:
#Plotting the number of current stocks
initial_stock = boxify_dataset.groupby('SKU_number')['ItemCount'].sum().reset_index()

# Calculate the total sold count for each SKU_number
total_sold = boxify_dataset.groupby('SKU_number')['SoldCount'].sum().reset_index()

# Merge initial stock and total sold count
stock_data = initial_stock.merge(total_sold, on='SKU_number', how='left')

stock_data['CurrentStock'] = stock_data['ItemCount'] - stock_data['SoldCount']
stock_data = stock_data.sort_values(by='CurrentStock', ascending=False).head(10)
fig = px.bar(stock_data, x='SKU_number', y='CurrentStock', title='Current Stock by SKU Number', 
             labels={'SKU_number': 'SKU Number', 'CurrentStock': 'Current Stock'})

# Show the plot
fig.update_traces(marker_color='#B323A5', marker_line_color='black', marker_line_width=1.5)
fig.update_layout(title_text='Number of Current Stocks for top-10 SKUs', title_x=0.5, autosize=False, width=600, height=600)
fig.show()

j. Calculating Inventory Turn-over¶

(i). Inventory turnover by SKU¶
In [53]:
boxify_dataset['InventoryTurnover'] = boxify_dataset['SoldCount'] / boxify_dataset['ItemCount']

boxify_dataset.replace([float('inf'), -float('inf')], float('nan'), inplace=True)
boxify_dataset.dropna(subset=['InventoryTurnover'], inplace=True)
boxify_turnOver_TopSKU=boxify_dataset.sort_values(by='InventoryTurnover', ascending=False).head(10)

# Plotting Inventory Turnover using Plotly
fig = px.bar(boxify_turnOver_TopSKU, x='SKU_number', y='InventoryTurnover', 
             title='Inventory Turnover by SKUs',
             labels={'SKU_number': 'SKU Number', 'InventoryTurnover': 'Inventory Turnover Ratio'})

# Update the layout for better readability
fig.update_layout(xaxis={'categoryorder': 'total descending'}, showlegend=False, 
                  title_x=0.5, autosize=False, width=600, height=600)
fig.update_traces(marker_color='#5c9f94', marker_line_color='black', marker_line_width=1.5)

# Show the plot
fig.show()

k. Total Stocks to Total Sales¶

In [23]:
# Calculate total inventory and total sales
total_inventory = boxify_dataset['ItemCount'].sum()
total_sales = boxify_dataset['SoldCount'].sum()

# Calculate stock-to-sales ratio
stock_to_sales_ratio = total_inventory / total_sales

print(f"Stock-to-Sales Ratio: {stock_to_sales_ratio:.2f}")
Stock-to-Sales Ratio: 33.60
In [48]:
boxify_dataset['stock_to_sales_ratio'] = boxify_dataset['ItemCount'].sum() / boxify_dataset['SoldCount'].sum()

stock_to_sales = boxify_dataset.groupby('ReleaseYear')['stock_to_sales_ratio'].sum().reset_index()
stock_to_sales

fig = px.line(stock_to_sales, x='ReleaseYear', y='stock_to_sales_ratio', 
             title='Stock-to-Sales Ratio by ReleaseYear',
             labels={'ReleaseYear': 'Year', 'stock_to_sales_ratio': 'Stock-to-Sales Ratio'})

# Update the layout for better readability
fig.update_layout(xaxis= {'categoryorder': 'total descending'} , showlegend=False, 
                  title_x=0.5, autosize=False, width=600, height=600)
fig.update_traces(mode='lines+markers', marker=dict(symbol='circle', size=5))
# Show the plot
fig.show()
C:\Users\hp\anaconda3\Lib\site-packages\_plotly_utils\basevalidators.py:106: FutureWarning:

The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result

In [56]:
boxify_dataset['StockToSalesRatio'] = boxify_dataset['ItemCount'] / boxify_dataset['SoldCount'].replace(0, pd.NA)

# Handle infinite or NaN values
boxify_dataset.replace([float('inf'), -float('inf')], float('nan'), inplace=True)
boxify_dataset.dropna(subset=['StockToSalesRatio'], inplace=True)
boxify_stocksRatio = boxify_dataset.sort_values(by='StockToSalesRatio', ascending=False).head(10)

# Plotting Stock-to-Sales Ratio using Plotly
fig = px.bar(boxify_stocksRatio, x='SKU_number', y='StockToSalesRatio', 
             title='Stock-to-Sales Ratio by SKU',
             labels={'SKU_number': 'SKU Number', 'StockToSalesRatio': 'Stock-to-Sales Ratio'})

# Update the layout for better readability
fig.update_layout(xaxis={'categoryorder': 'total descending'}, showlegend=False, autosize=False, width=600, height=600)
fig.update_traces(marker_color='#c7728c', marker_line_color='black', marker_line_width=1.5)
# Show the plot
fig.show()

l. Reorder Point¶

In [58]:
lead_time_days = 7
safety_stock_percentage = 0.2

# Calculate average daily demand
boxify_dataset['Average_Daily_Demand'] = boxify_dataset['SoldCount'] / 180  # the sales count is for six months

# Calculate demand during lead time
boxify_dataset['Demand_During_Lead_Time'] = boxify_dataset['Average_Daily_Demand'] * lead_time_days

# Calculate safety stock
boxify_dataset['Safety_Stock'] = boxify_dataset['Demand_During_Lead_Time'] * safety_stock_percentage

# Calculate reorder points
boxify_dataset['Reorder_Point'] = boxify_dataset['Demand_During_Lead_Time'] + boxify_dataset['Safety_Stock']

# Get the top 10 products by reorder points
top_10_reorder_points = boxify_dataset.nlargest(10, 'Reorder_Point')
fig = px.bar(top_10_reorder_points, x='SKU_number', y='Reorder_Point', 
             title='Top 10 Products by Reorder Points', 
             labels={'SKU_number':'SKU Number', 'Reorder_Point':'Reorder Point'})
fig.update_layout(xaxis={'categoryorder': 'total descending'}, showlegend=False, 
                  title_x=0.5, autosize=False, width=600, height=600)
fig.update_traces(marker_color='#90be6d', marker_line_color='black', marker_line_width=1.5)
fig.show()

m. Top SKUs by Yearly ReorderPoint¶

In [29]:
#boxify_dataset['ReleaseYear'] = boxify_dataset['ReleaseYear'].astype(int)

# Calculate yearly sales for each SKU
yearly_sales = boxify_dataset.groupby(['SKU_number', 'ReleaseYear']).agg({'SoldCount': 'sum'}).reset_index()

# Calculate average yearly sales per SKU
average_sales = yearly_sales.groupby('SKU_number').agg({'SoldCount': 'mean'}).reset_index()
average_sales.rename(columns={'SoldCount': 'AvgYearlySales'}, inplace=True)

# Assume lead time in years (for simplicity, assume 1 year lead time)
lead_time = 1

# Calculate lead time demand
average_sales['LeadTimeDemand'] = average_sales['AvgYearlySales'] * lead_time

# Calculate standard deviation of sales per SKU to estimate variability
sales_variability = yearly_sales.groupby('SKU_number').agg({'SoldCount': 'std'}).reset_index()
sales_variability.rename(columns={'SoldCount': 'SalesStdDev'}, inplace=True)

reorder_data = pd.merge(average_sales, sales_variability, on='SKU_number')

# Assume a safety stock level (e.g., 1.65 for 95% service level)
safety_factor = 1.65
reorder_data['SafetyStock'] = safety_factor * reorder_data['SalesStdDev']

# Calculate reorder points
reorder_data['ReorderPoint'] = reorder_data['LeadTimeDemand'] + reorder_data['SafetyStock']

# Plotting reorder points for top 10 SKUs with highest reorder points
top_10_reorder_points = reorder_data.nlargest(10, 'ReorderPoint')

fig = px.bar(top_10_reorder_points, x='SKU_number', y='ReorderPoint', title='Top 10 SKUs by Yearly Reorder Point', 
             text=top_10_reorder_points['ReorderPoint'].map(lambda x: f'{x:.2f}'))

fig.update_layout(xaxis={'categoryorder': 'total descending'}, showlegend=False, 
                  title_x=0.5, autosize=False, width=600, height=600)
fig.update_traces(marker_color='#ff6361', marker_line_color='black', marker_line_width=1.5, textposition='outside' )
fig.show()

top_10_reorder_points = top_10_reorder_points.merge(yearly_sales[['SKU_number', 'ReleaseYear']], on='SKU_number', how='left')
top_10_reorder_points
Out[29]:
SKU_number AvgYearlySales LeadTimeDemand SalesStdDev SafetyStock ReorderPoint ReleaseYear
0 2271238 1.5 1.5 2.121320 3.500179 5.000179 1900-01-01
1 2271238 1.5 1.5 2.121320 3.500179 5.000179 2012-01-01
2 3545620 1.5 1.5 2.121320 3.500179 5.000179 1900-01-01
3 3545620 1.5 1.5 2.121320 3.500179 5.000179 2000-01-01
4 2264246 1.0 1.0 1.414214 2.333452 3.333452 1900-01-01
5 2264246 1.0 1.0 1.414214 2.333452 3.333452 2011-01-01
6 3604079 1.0 1.0 1.414214 2.333452 3.333452 1900-01-01
7 3604079 1.0 1.0 1.414214 2.333452 3.333452 2001-01-01
8 1781265 0.5 0.5 0.707107 1.166726 1.666726 2015-01-01
9 1781265 0.5 0.5 0.707107 1.166726 1.666726 2016-01-01
10 2263930 0.5 0.5 0.707107 1.166726 1.666726 1900-01-01
11 2263930 0.5 0.5 0.707107 1.166726 1.666726 2011-01-01
12 2264421 0.5 0.5 0.707107 1.166726 1.666726 1900-01-01
13 2264421 0.5 0.5 0.707107 1.166726 1.666726 2011-01-01
14 2265599 0.5 0.5 0.707107 1.166726 1.666726 1900-01-01
15 2265599 0.5 0.5 0.707107 1.166726 1.666726 2011-01-01
16 2268109 0.5 0.5 0.707107 1.166726 1.666726 1900-01-01
17 2268109 0.5 0.5 0.707107 1.166726 1.666726 2010-01-01
18 270587 0.5 0.5 0.707107 1.166726 1.666726 2011-01-01
19 270587 0.5 0.5 0.707107 1.166726 1.666726 2012-01-01
In [92]:
fig = px.histogram(boxify_dataset, 
                   x="New_Release_Flag", 
                   color="New_Release_Flag",
                   facet_col="SoldFlag",
                   title="SoldFlag Counts for SKUs with and without Plans for Future Release",
                   labels={"New_Release_Flag": "New Release Flag", "count": "Count"},
                   category_orders={"SoldFlag": [0, 1]},
                   color_discrete_sequence=["#009596", "#C9190B"])

# Update layout for better visualization
fig.update_layout(
    barmode='group',
    height=600,
    width=1000)
C:\Users\hp\anaconda3\Lib\site-packages\plotly\express\_core.py:1958: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

o. Relation between Marketing Type and Price by Sales Performance¶

In [97]:
color_map = {
    0: '#9b19f5',
    1: '#ffa300'
}

# Plotting PriceReg vs MarketingType
fig_price_reg = px.box(boxify_dataset, 
                       x="MarketingType", 
                       y="PriceReg", 
                       color="SoldFlag",
                       title="PriceReg vs MarketingType",
                       labels={"MarketingType": "Marketing Type", "PriceReg": "PriceReg"},
                       category_orders={"SoldFlag": [0, 1]},
                      color_discrete_map=color_map)
                       
fig_price_reg.update_yaxes(range=[0, 750])
fig_price_reg.update_layout(title_x=0.5, autosize=False, width=600, height=600)
fig_price_reg.show()

# Plotting LowUserPrice vs MarketingType
fig_low_user_price = px.box(boxify_dataset, 
                            x="MarketingType", 
                            y="LowUserPrice", 
                            color="SoldFlag",
                            title="LowUserPrice vs MarketingType",
                            labels={"MarketingType": "Marketing Type", "LowUserPrice": "LowUserPrice"},
                            category_orders={"SoldFlag": [0, 1]},
                           color_discrete_map=color_map)
fig_low_user_price.update_yaxes(range=[0, 750])
fig_low_user_price.update_layout(title_x=0.5, autosize=False, width=600, height=600)
fig_low_user_price.show()

# Plotting LowNetPrice vs MarketingType
fig_low_net_price = px.box(boxify_dataset, 
                           x="MarketingType", 
                           y="LowNetPrice", 
                           color="SoldFlag",
                           title="LowNetPrice vs MarketingType",
                           labels={"MarketingType": "Marketing Type", "LowNetPrice": "LowNetPrice"},
                           category_orders={"SoldFlag": [0, 1]},
                          color_discrete_map=color_map)
fig_low_net_price.update_yaxes(range=[0, 750])
fig_low_net_price.update_layout(title_x=0.5, autosize=False, width=600, height=600)
fig_low_net_price.show()

p. Relationship among each variables¶

In [130]:
# Select only numeric columns from the DataFrame
numeric_data = boxify_dataset.select_dtypes(include=['float64', 'int64'])

# Calculate the correlation matrix
corr_matrix = numeric_data.corr()

# Create a Plotly heatmap
fig = go.Figure(data=go.Heatmap(
    z=corr_matrix.values,
    x=corr_matrix.columns,
    y=corr_matrix.columns,
    colorscale='PiYG',
    zmin=-1, zmax=1,
    text=corr_matrix.values,
    texttemplate="%{text:.2f}",
    textfont={"size":10},
))

# Update layout
fig.update_layout(
    title='Correlation Heatmap',
    xaxis_nticks=36,
    yaxis_nticks=36,
    title_x=0.5, autosize=False, width=700, height=700
)

fig.show()

q. Item Count Distribution and Sales Performance¶

In [157]:
item_count_sales = boxify_dataset.groupby('ItemCount')['SoldCount'].sum().reset_index()
item_count_avg_sales = boxify_dataset.groupby('ItemCount')['SoldCount'].mean().reset_index()

# Merge the two dataframes
item_count_summary = pd.merge(item_count_sales, item_count_avg_sales, on='ItemCount', suffixes=('_Total', '_Average'))


# Plot the distribution and average of ItemCount in relation to sales performance
fig = px.scatter(
    item_count_summary,
    x='ItemCount',
    y=['SoldCount_Total'],
    title='Item Count Distribution and Sales Performance',
    labels={'ItemCount': 'Item Count', 'SoldCount': 'Sales Count'},
    opacity=0.9
)
fig.update_layout(title_x=0.5, autosize=False, width=700, height=700)
fig.update_traces(marker=dict(size=5))
fig.show()